To predict the concrete strength using the data available in file concrete_data.xls. Apply feature engineering and model tuning to obtain 80% to 95% of R2score.
The data for this project is available in file https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/
Name -- Data Type -- Measurement -- Description
1. Univariate analysis (10 marks)
Import all necessary modules and load the data
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LinearRegression
# Silence warnings for a clean notebook output.
warnings.filterwarnings("ignore") # Not always recommended, but just so our notebook looks clean for this activity
sns.set_style('darkgrid')
Load and review data
df = pd.read_csv("concrete.csv");
df.head(20)
Print the datatypes of each column and the shape of the dataset
df.shape
df.dtypes
df.dtypes.value_counts()
df.columns
Range
print(df.max() - df.min())
Print the descriptive statistics (min,max,count,25th percentile,50th percentile, 75th percentile values)
df.describe().transpose()
IQR -
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Median
df.median()
df.skew() # to measure the skewness of every attribute
Find the number zeros in the column and the missing values
(df == 0).sum() # We have coloums which have zero values Slag, ash and Superplastic
Ash, slag, and superplastic have a lot of zeros — are they missing values that need to be imputed?
df.isna().sum() # We don't have any missing values
Using univariate analysis check the individual attributes for their basic statistic such as central values, spread, tails etc.
# --- Univariate analysis: cement ---
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# are the replacements) — confirm the seaborn version pinned for this notebook.
#plt.figure(figsize=(16,9)) # set figure size.
cement = df['cement']
#plt.hist(cement, bins=50)
#plt.figure(figsize=(16,9)) # set figure size.
#sns.distplot(cement,bins=50)
sns.distplot(cement)
#sns.countplot(cement)
sns.violinplot(cement)
sns.boxplot(cement)
plt.show()
# Central tendency and asymmetry of the cement distribution.
mean=cement.mean()
median=cement.median()
mode=cement.mode()
skew=cement.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(cement,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Cement') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
#plt.figure(figsize=(16,9)) # set figure size.
#Cumulative Distribution
sns.distplot(cement, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: slag ---
#plt.figure(figsize=(16,9)) # set figure size.
slag = df['slag']
sns.distplot(slag)
sns.violinplot(slag)
sns.boxplot(slag)
plt.show()
#There are outliers we may have to treat them
mean=slag.mean()
median=slag.median()
mode=slag.mode()
skew=slag.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(slag,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Slag') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of slag.
sns.distplot(slag, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: ash ---
#plt.figure(figsize=(16,9)) # set figure size.
ash = df['ash']
sns.distplot(ash)
sns.violinplot(ash)
sns.boxplot(ash)
plt.show()
# Cumulative distribution of ash.
sns.distplot(ash, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
mean=ash.mean()
median=ash.median()
mode=ash.mode()
skew=ash.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(ash,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Ash') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
#plt.figure(figsize=(16,9)) # set figure size.
# --- Univariate analysis: water ---
water = df['water']
sns.distplot(water)
sns.violinplot(water)
sns.boxplot(water)
plt.show()
#There are outliers we may have to treat them
mean=water.mean()
median=water.median()
mode=water.mode()
skew=water.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(water,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Water') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of water.
sns.distplot(water, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: superplastic ---
#plt.figure(figsize=(16,9)) # set figure size.
superplastic = df['superplastic']
sns.distplot(superplastic)
sns.violinplot(superplastic)
sns.boxplot(superplastic)
plt.show()
#There are outliers we may have to treat them
mean=superplastic.mean()
median=superplastic.median()
mode=superplastic.mode()
skew=superplastic.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(superplastic,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Superplastic') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of superplastic.
sns.distplot(superplastic, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg','fineagg', 'age', 'strength']
# --- Univariate analysis: coarseagg ---
#plt.figure(figsize=(16,9)) # set figure size.
coarseagg = df['coarseagg']
sns.distplot(coarseagg)
sns.violinplot(coarseagg)
sns.boxplot(coarseagg)
plt.show()
mean=coarseagg.mean()
median=coarseagg.median()
mode=coarseagg.mode()
skew=coarseagg.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(coarseagg,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Coarse Aggegate') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of coarseagg.
sns.distplot(coarseagg, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: fineagg ---
#plt.figure(figsize=(16,9)) # set figure size.
fineagg = df['fineagg']
sns.distplot(fineagg)
sns.violinplot(fineagg)
sns.boxplot(fineagg)
plt.show()
#There are outliers we may have to treat them
mean=fineagg.mean()
median=fineagg.median()
mode=fineagg.mode()
skew=fineagg.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(fineagg,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Fine Aggregate') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of fineagg.
sns.distplot(fineagg, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: age ---
# age takes a small set of discrete values, hence the extra countplot.
#plt.figure(figsize=(16,9)) # set figure size.
age = df['age']
sns.countplot(age)
sns.distplot(age)
sns.violinplot(age)
sns.boxplot(age)
plt.show()
#There are outliers we may have to treat them
mean=age.mean()
median=age.median()
mode=age.mode()
skew=age.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(age,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Age') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of age.
sns.distplot(age, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
# --- Univariate analysis: strength (the prediction target) ---
#plt.figure(figsize=(16,9)) # set figure size.
strength = df['strength']
sns.distplot(df['strength'])
sns.violinplot(strength)
sns.boxplot(strength)
plt.show()
#There are outliers we may have to treat them
mean=strength.mean()
median=strength.median()
mode=strength.mode()
skew=strength.skew()
print('Mean: ',mean,'\nMedian: ',median,'\nMode: ',mode[0], '\nSkew: ',skew)
#plt.figure(figsize=(10,5)) # set the figure size
#plt.hist(cement,bins=100,color='lightblue') #Plot the histogram
plt.hist(strength,color='lightblue') #Plot the histogram
plt.axvline(mean,color='green',label='Mean') # Draw reference lines for the mean, median and mode
plt.axvline(median,color='blue',label='Median')
plt.axvline(mode[0],color='red',label='Mode')
#plt.axvline(mode[1],color='red',label='Mode2')
plt.xlabel('Strength') # label the x-axis
plt.ylabel('Frequency') # label the y-axis
plt.legend() # Plot the legend
plt.show()
# Cumulative distribution of strength.
sns.distplot(strength, hist_kws=dict(cumulative=True), kde_kws=dict(cumulative=True))
This is the target variable
# --- Bivariate analysis: each predictor vs. the target 'strength' ---
# NOTE(review): sns.scatterplot is called with positional x/y arguments below;
# seaborn >= 0.12 requires keyword arguments — confirm the pinned version.
sns.jointplot(x="cement", y="strength", data=df, kind="reg")
plt.show()
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['cement'], df['strength'])
plt.show()
#
plt.figure(figsize=(20,5))
sns.regplot(x='cement',y='strength', data=df ) # regression plot - scatter plot with a regression line
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='cement',y='strength', data=df )
plt.show()
sns.jointplot(x="slag", y="strength", data=df, kind="reg")
plt.show()
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['slag'], df['strength'])
plt.show()
# Looks like there are a lot of outliers here. We need to handle them.
plt.figure(figsize=(20,5))
sns.regplot(x='slag',y='strength', data=df ) # regression plot - scatter plot with a regression line
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='slag',y='strength', data=df )
plt.show()
sns.jointplot(x="water", y="strength", data=df, kind="reg")
plt.show()
plt.show()
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['water'], df['strength'])
plt.show()
# Looks like there are a lot of outliers here. We need to handle them.
plt.figure(figsize=(20,5))
sns.regplot(x='water',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='water',y='strength', data=df )
plt.show()
sns.jointplot(x="ash", y="strength", data=df, kind="reg")
plt.show()
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['ash'], df['strength'])
plt.show()
plt.figure(figsize=(20,5))
sns.regplot(x='ash',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='ash',y='strength', data=df )
plt.show()
sns.jointplot(x="superplastic", y="strength", data=df, kind="reg")
plt.show()
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['superplastic'], df['strength'])
plt.show()
plt.figure(figsize=(20,5))
sns.regplot(x='superplastic',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='superplastic',y='strength', data=df )
plt.show()
sns.jointplot(x="coarseagg", y="strength", data=df, kind="reg")
plt.show()
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['coarseagg'], df['strength'])
plt.show()
plt.figure(figsize=(20,5))
sns.regplot(x='coarseagg',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='coarseagg',y='strength', data=df )
plt.show()
sns.jointplot(x="fineagg", y="strength", data=df, kind="reg")
plt.show()
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['fineagg'], df['strength'])
plt.show()
plt.figure(figsize=(20,5))
sns.regplot(x='fineagg',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='fineagg',y='strength', data=df )
plt.show()
sns.jointplot(x="age", y="strength", data=df, kind="reg")
plt.show()
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
plt.figure(figsize=(20,5)) # set figure size.
sns.scatterplot(df['age'], df['strength'])
plt.show()
plt.figure(figsize=(20,5))
sns.regplot(x='age',y='strength', data=df )
plt.show()
plt.figure(figsize=(20,5))
sns.lineplot(x='age',y='strength', data=df )
plt.show()
# Strength distribution per discrete age value.
plt.figure(figsize=(20,5))
sns.boxplot(x="age", y="strength", data=df)
plt.show()
df.corr() # prints the correlation coefficient between every pair of attributes
# Correlation heatmap over all attribute pairs.
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(),
annot=True,
linewidths=.5,
center=0,
cbar=False,
cmap="YlGnBu")
plt.show()
# Pairwise scatter matrix with KDE on the diagonal.
sns.pairplot(df, diag_kind='kde')
plt.show()
Feature Engineering techniques – (10 marks)
Prepare Data for analysis
Identify opportunities (if any) to extract a new feature from existing features, drop a feature (if required).
Get the data model ready and do a train test split.
Decide on the complexity of the model: should it be a simple linear model in terms of parameters, or would a quadratic or higher-degree polynomial model be required?
# Experiment: the zeros in slag/ash/superplastic might be missing values.
# Domain literature says genuine zero amounts are possible, so instead of assuming,
# we build two imputed copies (KNN and mean) and later compare them against the
# untouched original frame.
#['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

zero_cols = ['slag', 'ash', 'superplastic']

# --- Copy 1: zeros -> NaN, then k-nearest-neighbours imputation ---
df2 = df.copy()
df2[zero_cols] = df[zero_cols].replace({0: np.nan})
df2.isna().sum()
knn_imputer = KNNImputer(n_neighbors=2)
df2[:] = knn_imputer.fit_transform(df2)
df2.isna().sum()
(df2 == 0).sum()  # slag/ash/superplastic no longer contain zero values

# --- Copy 2: SimpleImputer treating 0 as the missing marker, mean strategy ---
df3 = df.copy()
mean_imputer = SimpleImputer(missing_values=0, strategy="mean")
df3[zero_cols] = mean_imputer.fit(df3[zero_cols]).transform(df3[zero_cols])
#Split Data
# independent variables
X = df.drop('strength', axis=1)
# the dependent variable
y = df[['strength']]
# KNN-imputed copy
X2 = df2.drop('strength', axis=1)
# the dependent variable
y2 = df2[['strength']]
# SimpleImputer (mean) copy
X3 = df3.drop('strength', axis=1)
# the dependent variable
y3 = df3[['strength']]
# Split X and y into training and test set in 70:30 ratio
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
#Imputed Data KNN
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.30, random_state=1)
#Imputed Data Simple Imputer
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.30, random_state=1)
#Imputed Data (Simple Imputer) — comment fixed: these prints use the df3 split
print("{0:0.2f}% data is in training set".format((len(X3_train)/len(df3.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X3_test)/len(df3.index)) * 100))
#Imputed Data KNN
print("{0:0.2f}% data is in training set".format((len(X2_train)/len(df2.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X2_test)/len(df2.index)) * 100))
# Original (non-imputed) data
print("{0:0.2f}% data is in training set".format((len(X_train)/len(df.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X_test)/len(df.index)) * 100))
#Fit Linear Model for Imputed data (Simple)
regression_model3 = LinearRegression()
regression_model3.fit(X3_train, y3_train)
#The score (R^2) for in-sample and out of sample
print(regression_model3.score(X3_train, y3_train))
print(regression_model3.score(X3_test, y3_test))
#Fit Linear Model for Imputed data (KNN)
regression_model2 = LinearRegression()
regression_model2.fit(X2_train, y2_train)
#The score (R^2) for in-sample and out of sample
print(regression_model2.score(X2_train, y2_train))
print(regression_model2.score(X2_test, y2_test))
#Fit Linear Model on the original (non-imputed) data
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
#The score (R^2) for in-sample and out of sample
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))
We have better scores with the original data
#Here are the coefficients for each variable for the imputed data (Simple) — label fixed, this is regression_model3
# coef_ is 2-D because y was a one-column DataFrame, hence coef_[0][idx]
for idx, col_name in enumerate(X3_train.columns):
print("The coefficient for {} is {}".format(col_name, regression_model3.coef_[0][idx]))
#Here are the coefficients for each variable for the imputed data (KNN)
for idx, col_name in enumerate(X2_train.columns):
print("The coefficient for {} is {}".format(col_name, regression_model2.coef_[0][idx]))
#Here are the coefficients for each variable for the original data
for idx, col_name in enumerate(X_train.columns):
print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
#Let us generate polynomial models reflecting the non-linear interaction between some dimensions
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
#Imputed Data (Simple)
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X3_poly_train = poly.fit_transform(X3_train)
# Fit the feature expansion on the training split only and merely transform the
# test split — calling fit_transform on test data is a leakage-prone pattern.
X3_poly_test = poly.transform(X3_test)
poly_clf3 = linear_model.LinearRegression()
poly_clf3.fit(X3_poly_train, y3_train)
y3_pred = poly_clf3.predict(X3_poly_test)
#The score (R^2) for in-sample and out of sample for the imputed data.
#The scores drop here when you compare the non imputed data above
print(poly_clf3.score(X3_poly_train, y3_train))
print(poly_clf3.score(X3_poly_test, y3_test))
#Imputed Data (KNN)
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X2_poly_train = poly.fit_transform(X2_train)
X2_poly_test = poly.transform(X2_test)
poly_clf2 = linear_model.LinearRegression()
poly_clf2.fit(X2_poly_train, y2_train)
y2_pred = poly_clf2.predict(X2_poly_test)
#The score (R^2) for in-sample and out of sample for the imputed data.
#The scores drop here when you compare the non imputed data above
print(poly_clf2.score(X2_poly_train, y2_train))
print(poly_clf2.score(X2_poly_test, y2_test))
# Original (non-imputed) data
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)
poly_clf = linear_model.LinearRegression()
poly_clf.fit(X_poly_train, y_train)
y_pred = poly_clf.predict(X_poly_test)
#The score (R^2) for in-sample and out of sample (our measure of success and does improve)
print(poly_clf.score(X_poly_train, y_train))
print(poly_clf.score(X_poly_test, y_test))
We have better scores with the poly data derived from the original data
# Polynomial models improve the scores at the cost of 29 extra variables!
# Compare the raw feature count with the degree-2 interaction-only expansion.
print(X_train.shape)
print(X_poly_train.shape)
Working with Outliers: Correcting, Removing
Use previously calculated IQR score to filter out the outliers by keeping only valid values.
# Tukey whiskers per column — computed once (they were being recomputed on
# every loop iteration, along with a duplicated np.where call).
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR
df_out = df[~((df < lower_whisker) | (df > upper_whisker)).any(axis=1)] # rows without outliers
df_out.shape
concrete_df = df.copy()
# Replace every outlier on the lower side by the lower whisker
low_rows, low_cols = np.where(concrete_df < lower_whisker)
for i, j in zip(low_rows, low_cols):
    # .iloc[j] avoids the deprecated positional fallback of Series[int] on a label-indexed Series
    concrete_df.iloc[i, j] = lower_whisker.iloc[j]
#Replace every outlier on the upper side by the upper whisker
up_rows, up_cols = np.where(concrete_df > upper_whisker)
for i, j in zip(up_rows, up_cols):
    concrete_df.iloc[i, j] = upper_whisker.iloc[j]
concrete_df.shape, df.shape
sim_df = concrete_df.copy()
features = [col for col in df.columns if col != 'strength']
X = sim_df.drop("strength" , axis=1)
# NOTE(review): pop() yields a Series here, whereas earlier y was a one-column
# DataFrame — downstream sklearn calls accept both.
y = sim_df.pop("strength")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
Let's check the split of the data
# Proportion of rows in each split, relative to the full dataset.
print("{0:0.2f}% data is in training set".format((len(X_train)/len(df.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X_test)/len(df.index)) * 100))
(Creating the model and tuning it)
1. Algorithms that you think will be suitable for this project (at least 3 algorithms). Use Kfold Cross Validation to evaluate model performance. Use appropriate metrics and make a DataFrame to compare models w.r.t their metrics. (15 marks)
#Fit Linear Model on the outlier-capped data (comment fixed: this split comes from sim_df, not the imputed frames)
lr = LinearRegression()
lr.fit(X_train, y_train)
#The score (R^2) for in-sample and out of sample
print(lr.score(X_train, y_train))
#Test accuracy calculation
lr_score = lr.score(X_test, y_test)
print(lr_score)
#Let us generate polynomial models reflecting the non-linear interaction between some dimensions
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly_train = poly.fit_transform(X_train)
# Transform (not fit_transform) the test split — the expansion is fitted on train only.
X_poly_test = poly.transform(X_test)
poly_clf = linear_model.LinearRegression()
poly_clf.fit(X_poly_train, y_train)
y_pred = poly_clf.predict(X_poly_test)
#The score (R^2) for in-sample and out of sample (our measure of success and does improve)
print(poly_clf.score(X_poly_train, y_train))
print(poly_clf.score(X_poly_test, y_test))
k-fold cross validation( without stratification)
Usually k is set as 10-20 in practical settings, depends on data set size
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
num_folds = 10
seed = 77
# shuffle=True is required for random_state to take effect; recent scikit-learn
# raises a ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
#Linear Regression
# NOTE(review): cross_val_score clones and refits the estimator on the raw X,
# so these folds score a plain LinearRegression, not the polynomial model —
# wrap PolynomialFeatures + LinearRegression in a Pipeline to CV the poly model. TODO confirm intent.
lr_results = cross_val_score(poly_clf,X, y, cv=kfold)
lr_results
# Mean and spread of the per-fold scores.
kfm = np.mean(abs(lr_results))
kfstd = lr_results.std()
from sklearn.metrics import mean_squared_error, r2_score
# Hold-out metrics computed from the polynomial model's test predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)
# The mean squared error
print('Mean squared error: %.2f'%mse)
# Explained variance score: 1 is perfect prediction
print('Test Variance score(R2 Score): %.2f'% r2)
print('Root Mean Square Error: %.2f'%rmse)
resultsDf = pd.DataFrame({'Method':['Linear Regression'],'MSE': mse,'RMSE': rmse,'R2': r2,'K-Fold-M': kfm,'K-Fold-STD':kfstd})
# So let's run the model against the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Ground Truth vs Predicted")
plt.show()
#Decision Tree Regressor
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# create a regressor object
dt = DecisionTreeRegressor(random_state = 0)
# fit the regressor with X and Y data
dtm = dt.fit(X_poly_train, y_train)
y_pred = dt.predict(X_poly_test)
# Training-set R^2 (an unpruned tree typically scores ~1.0 on its own training data).
regressor_score = dt.score(X_poly_train, y_train)
# Have a look at R_squared to give an idea of the fit ,
# Explained variance score: 1 is perfect prediction
print('coefficient of determination R^2 of the prediction.: ',regressor_score)
#Decision Tree Regressor
# NOTE(review): cross_val_score refits on the raw X/y, not the polynomial
# features the tree was fitted on above — confirm this comparison is intended.
dt_results = cross_val_score(dtm,X, y, cv=kfold)
dt_results
kfm = np.mean(abs(dt_results))
kfstd = dt_results.std()
# The mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)
print('Mean squared error: %.2f'% mse)
# Explained variance score: 1 is perfect prediction
print('Test Variance score(R2 Score): %.2f'% r2)
print('Root Mean Square Error: %.2f'%rmse)
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
tempResultsDf=pd.DataFrame({'Method':['Decision Tree'],'MSE': mse,'RMSE': rmse,'R2': r2,'K-Fold-M': kfm,'K-Fold-STD':kfstd})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
#resultsDf
# So let's run the model against the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Ground Truth vs Predicted")
plt.show()
#Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth=9, random_state=0)
# fit the regressor with X and Y data
rfm = rf.fit(X_poly_train, y_train)
y_pred = rf.predict(X_poly_test)
# Training-set R^2 of the fitted forest.
score = rf.score(X_poly_train, y_train)
# Have a look at R_squared to give an idea of the fit ,
# Explained variance score: 1 is perfect prediction
print('coefficient of determination R^2 of the prediction.: ',score)
#RandomForest Regressor
# NOTE(review): cross_val_score refits on the raw X/y, not the polynomial
# features the forest was fitted on above — confirm this comparison is intended.
rf_results = cross_val_score(rfm,X, y, cv=kfold)
rf_results
kfm = np.mean(abs(rf_results))
kfstd = rf_results.std()
# The mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)
print('Mean squared error: %.2f'% mse)
# Explained variance score: 1 is perfect prediction
print('Test Variance score(R2 Score): %.2f'% r2)
print('Root Mean Square Error: %.2f'%rmse)
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
tempResultsDf=pd.DataFrame({'Method':['Random Forest'],'MSE': mse,'RMSE': rmse,'R2': r2,'K-Fold-M': kfm,'K-Fold-STD':kfstd})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
#resultsDf
# So let's run the model against the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Ground Truth vs Predicted")
plt.show()
#Scores of the three algorithms
resultsDf
(Creating the model and tuning it)
2. Techniques employed to squeeze that extra performance out of the model without making it over fit. Use Grid Search or Random Search on any of the two models used above. Make a DataFrame to compare models after hyperparameter tuning and their metrics as above. (15 marks)
from scipy.stats import randint as sp_randint  # NOTE(review): imported but not used below
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# Look at parameters used by our current Random forest Model
print('Parameters currently in use:\n')
pprint(rf.get_params())
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was removed for regressors in scikit-learn 1.3; for a regressor it meant
# "all features", which 1.0 expresses exactly.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_poly_train, y_train)
print(rf_random.best_params_)
# Best estimator, refit on the full training split by RandomizedSearchCV.
best_random = rf_random.best_estimator_
best_random
y_pred = best_random.predict(X_poly_test)
# Training-set R^2 of the winning model (test metrics printed below).
score = best_random.score(X_poly_train, y_train)
# The mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)
print('Mean squared error: %.2f'% mse)
# Explained variance score: 1 is perfect prediction
print('Test Variance score(R2 Score): %.2f'% r2)
print('Root Mean Square Error: %.2f'%rmse)
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
tempResultsDf=pd.DataFrame({'Method':['Random Forest (RandomizedSearchCV)'],'MSE': mse,'RMSE': rmse,'R2': r2,'K-Fold-M': np.nan,'K-Fold-STD':np.nan})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
# So let's run the model against the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Ground Truth vs Predicted")
plt.show()
resultsDf
print('Parameters currently in use:\n')
pprint(dt.get_params())
#Tuning Decision Tree using Grid Search
from sklearn.metrics import make_scorer
scoring = make_scorer(r2_score)
# min_samples_split must be >= 2 — the original grid included 1, which makes
# scikit-learn raise during the search; range(2, 50) covers the valid values.
param_grid = [
    {'max_depth': range(2,16,2), 'max_features': [3, 4, 5]},
    {'random_state': [0, 1, 2, 3, 4], 'min_samples_split': list(range(2, 50))}
]
g_cv = GridSearchCV(dt,
                    param_grid=param_grid,
                    scoring=scoring, cv=5, refit=True)
g_cv.fit(X_poly_train, y_train)
g_cv.best_params_
best_dt = g_cv.best_estimator_
y_pred = best_dt.predict(X_poly_test)
# Training-set R^2 of the winning tree (test metrics printed below).
score = best_dt.score(X_poly_train, y_train)
# The mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)
print('Mean squared error: %.2f'% mse)
# Explained variance score: 1 is perfect prediction
print('Test Variance score(R2 Score): %.2f'% r2)
print('Root Mean Square Error: %.2f'%rmse)
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
tempResultsDf=pd.DataFrame({'Method':['Decision Tree (GridSearchCV)'],'MSE': mse,'RMSE': rmse,'R2': r2,'K-Fold-M': np.nan,'K-Fold-STD':np.nan})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'MSE', 'RMSE', 'R2', 'K-Fold-M', 'K-Fold-STD']]
# So let's run the model against the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Ground Truth vs Predicted")
plt.show()
resultsDf